5 Functional traits

load("data/data.Rdata")
cluster_kegg_max <- cluster_kegg %>% 
  filter(!is.na(secondary_cluster)) %>%
  select(-overall_strategy) %>%
  group_by(secondary_cluster) %>%
  summarise(across(everything(), max, na.rm = TRUE))

5.1 Trait overview

phylum_heatmap <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(cluster_taxonomy, by=join_by(phylum == Phylum)) %>%
    filter(secondary_cluster %in% cluster_tree$tip.label) %>%
    arrange(match(secondary_cluster, cluster_tree$tip.label)) %>%
    select(secondary_cluster,phylum) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    filter(!is.na(secondary_cluster)) %>%
    column_to_rownames(var = "secondary_cluster") 

function_tree <- keep.tip(cluster_tree, tip=rownames(phylum_heatmap))

function_table <- cluster_kegg_max %>%
    filter(secondary_cluster %in% function_tree$tip.label) %>%
    column_to_rownames(var="secondary_cluster")

# Generate  basal tree
function_tree <- force.ultrametric(function_tree, method="extend") %>%
                ggtree(., size = 0.3) 
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
#Add phylum colors next to the tree tips
function_tree <- gheatmap(function_tree, phylum_heatmap, offset=0, width=0.1, colnames=FALSE) +
            scale_fill_manual(values=phylum_colors) +
            labs(fill="Phylum")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

#Add functions heatmap
function_tree <- gheatmap(function_tree, function_table, offset=0.5, width=3.5, colnames=FALSE) +
            vexpand(.08) +
            coord_cartesian(clip = "off") +
            scale_fill_gradient(low = "#f4f4f4", high = "steelblue", na.value="white") +
            labs(fill="GIFT")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

function_tree

## Functional ordination

Rtsne(X=cluster_kegg_max %>% column_to_rownames(var="secondary_cluster"), dims = 2, check_duplicates = FALSE)$Y %>%
  as.data.frame() %>%
  mutate(secondary_cluster=cluster_kegg_max$secondary_cluster) %>%
  rename(tSNE1=V1,tSNE2=V2) %>%
  left_join(cluster_taxonomy,by="secondary_cluster") %>%
  left_join(cluster_prevalence,by="secondary_cluster") %>%
  ggplot(aes(x = tSNE1, y = tSNE2, color = Phylum, size=n_strategy))+
        geom_point(shape=16, alpha=0.7) +
        scale_color_manual(values=phylum_colors) +
        theme_minimal() +
        labs(color="Phylum", size="Number of strategies") +
        guides(color = guide_legend(override.aes = list(size = 5)))

5.2 Trait recovery

all_strategy_clusters <- cluster_prevalence %>% 
  filter(n_strategy==10) %>% 
  pull(secondary_cluster)

# Filter clusters recovered in all strategies
cluster_kegg_max_filt <- cluster_kegg_max %>%
    filter(secondary_cluster %in% all_strategy_clusters) 

cluster_kegg_proportion_max <- cluster_kegg %>%
  filter(!is.na(secondary_cluster)) %>%
  filter(secondary_cluster %in% all_strategy_clusters) %>%
  group_by(secondary_cluster) %>%
  mutate(across(where(is.numeric), ~ . / max(., na.rm = TRUE))) %>%
  ungroup() %>%
  rowwise() %>%
  mutate(average = rowMeans(across(where(is.numeric)), na.rm = TRUE)) %>%
  select(secondary_cluster,overall_strategy,average)
  
cluster_kegg_proportion_max %>%
  group_by(overall_strategy) %>%
  summarise(mean=mean(average),sd=sd(average)) %>%
  tt()
tinytable_m4rhvuw4cyoq6uyf3kv0
overall_strategy mean sd
coassembly_animal 0.9669740 0.03788229
coassembly_timepoint_all 0.9642462 0.04627303
coassembly_timepoint_cage 0.9695513 0.03515391
multicoverage_animal 0.9686745 0.03929870
multicoverage_timepoint_all 0.9453590 0.08602739
multicoverage_timepoint_cage 0.9704001 0.03229802
multisplit_animal 0.9595563 0.03673926
multisplit_timepoint_all 0.9760940 0.02945151
multisplit_timepoint_cage 0.9589287 0.03776599
single_coverage 0.9622317 0.03741529
cluster_kegg_proportion_max %>%
  left_join(cluster_taxonomy,by="secondary_cluster") %>%
  ggplot(aes(x = average, y = overall_strategy, group=overall_strategy, color=Phylum)) +
      geom_boxplot(color="#999999", fill="#f4f4f4", outlier.shape = NA) +
      scale_color_manual(values=phylum_colors[-c(1,4,6,7)]) +
      xlim(0.8, 1)+
      geom_jitter(alpha=0.3) +
      theme_classic() +
      theme() +
      labs(x="Function recovery", y="Strategy")

cluster_kegg_proportion_max %>%
  mutate(secondary_cluster=factor(secondary_cluster,levels=rev(cluster_tree$tip.label[cluster_tree$tip.label %in% cluster_kegg_proportion_max$secondary_cluster]))) %>%
  ggplot(aes(x=secondary_cluster,y=overall_strategy,fill=average))+
    geom_tile()+
    scale_fill_distiller(palette = "YlGnBu", direction = -1) +
    theme_minimal() +
    theme(axis.text.x = element_blank(),
        axis.title.y = element_blank()) +
    labs(y="Strategy",x="Clusters",fill="Function recovery")

completeness_stats <- bin_metadata %>%
  select(genome,completeness,contamination,overall_strategy, assembly, size) %>%
  group_by(overall_strategy) %>%
  summarise(completeness=mean(completeness))

functions_stats <- cluster_kegg_proportion_max %>%
  group_by(overall_strategy) %>%
  summarise(functions=mean(average))

full_join(completeness_stats,functions_stats,by="overall_strategy") %>%
  ggplot(aes(x=functions,y=completeness, color=overall_strategy))+
    geom_point()+
    scale_color_manual(values=strategy_colors)+
    theme_minimal()